************************************************************
************************************************************
**														  **
**	Tanzania, 2012-13									  **
**										   				  **
**										   				  **
************************************************************
************************************************************

clear
version 14
set more off
cap log close

gl IN 	"${DATADRIVE}\LSMS-ISA\2012-13"
gl OUT 	"${LOCDRIVE}\Results"
gl TEMP	"${LOCDRIVE}\Tempdata"

loc hh y3_hhid
loc ea ea
loc weight y3_weight
loc strata strataid
loc admin0 macroregion
loc admin1 region 		
loc admin2 district  	
loc admin3 ward
loc indiv indidy3

set matsize 10000

loc admin0_base = $admin0_base
loc admin1_base = $admin1_base
loc admin2_base = $admin2_base
loc admin3_base = $admin3_base

loc gooddisagglist $gooddisagglist 
loc servicedisagglist $servicedisagglist

log using "${TEMP}\DataPrep_Tanzania_R3", replace


*********************************
**** Create Sample Files ****
*********************************/

use  "${IN}\HH_SEC_A.dta", clear
sort `hh'

rename y3_rural urban
recode urban (0=1) (1=0) /*was coded as 0 urban 1 rural*/
label define yesno 1 yes 0 no
label values urban yesno

rename hh_a01_1 `admin1'
rename hh_a02_1 `admin2'
rename hh_a03_1 `admin3'
rename hh_a04_1 `ea'

replace `ea' = 12 if y3_cluster=="09-2-013-012"
*replace `hh' = "1877-001" if `hh'=="1875-105" --> added to BID in 2019 but I don't think this matters
*replace `hh' = "2432-006" if `hh'=="2431-105" --> added to BID in 2019 but I don't think this matters
sort `hh'

drop if `ea'==. | `admin3'==. /*8 obs, not qutie sure why missing, but they cause problems with the merges and we can't fill*/

gen macroregion=1 if region==1 | region==13		 /*central*/
replace macroregion=2 if region==2 | region==3 | region==4 | region==21	/* northern*/
replace macroregion=3 if region==5 | region==6 | region==7		/*eastern*/
replace macroregion=4 if region==8 | region==9 | region==10	 /*southern*/
replace macroregion=5 if region==11 | region==12 | region==15	/*southern highlands*/
replace macroregion=6 if region==14 | region==16 | region==17	/*western*/
replace macroregion=7 if region==18 | region==19 | region==20	/*lake*/
replace macroregion=8 if region>=51 & region<=55	/*zanzibar*/

tab macroregion,m
label define macroregion 1 "Central" 2 "Northern" 3 "Eastern" 4 "Southern" 5 "Southern Highlands" 6 "Western" 7 "Lakes" 8 "Zanzibar"
label values macroregion macroregion 

keep `hh' `ea' `weight' `strata' `admin0' `admin1' `admin2' `admin3' clusterid y3_cluster urban
tempfile sample
save `sample', replace

g count = 1
collapse (sum) count (median) urban_ea=urban, by(`ea' `admin0' `admin1' `admin2' `admin3')
drop count
sort `admin0' `admin1' `admin2' `admin3' `ea'
egen ea_c = group(`admin0' `admin1' `admin2' `admin3' `ea')
tempfile sample_ea
save `sample_ea', replace

g count = 1
collapse (sum) count (median) urban_adm3=urban_ea, by(`admin0' `admin1' `admin2' `admin3')
drop count
recode urban_adm3 (0.5=1)
sort `admin0' `admin1' `admin2' `admin3'
egen admin3_c = group(`admin0' `admin1' `admin2' `admin3')
su admin3_c if `admin0' == `admin0_base' & `admin1' == `admin1_base' & `admin2' == `admin2_base' & `admin3'==`admin3_base', detail
loc admin3_cbase = r(p50)
display "`admin3_cbase'"
tempfile sample_admin3
save `sample_admin3', replace

g count = 1
collapse (sum) count, by(`admin0' `admin1' `admin2')
drop count
sort `admin0' `admin1' `admin2' 
egen admin2_c = group(`admin0' `admin1' `admin2')
su admin2_c if `admin0' == `admin0_base' & `admin1' == `admin1_base' & `admin2' == `admin2_base', detail
loc admin2_cbase = r(p50)
display "`admin2_cbase'"
tempfile sample_admin2
save `sample_admin2', replace

g count = 1
collapse (sum) count, by(`admin0' `admin1')
drop count
sort `admin0' `admin1' 
egen admin1_c = group(`admin0' `admin1')
su admin1_c if `admin0' == `admin0_base' & `admin1' == `admin1_base', detail
loc admin1_cbase = r(p50)
display "`admin1_cbase'"
tempfile sample_admin1
save `sample_admin1', replace

g count = 1
collapse (sum) count, by(`admin0')
drop count
sort `admin0' 
egen admin0_c = group(`admin0')
su admin0_c if `admin0' == `admin0_base', detail
loc admin0_cbase = r(p50)
display "`admin0_cbase'"
tempfile sample_admin0
save `sample_admin0', replace
*


** Create a community level merge -- this is messy
* y3 cluster variable is basically a coding of id1_id2_id3_id4. Can find this match strategy in old versions but the y2 approach is better

use "${IN}\COM_SEC_A1A2.dta", clear
keep y3_cluster id_01 id_02 id_03 id_04
replace y3_cluster="07-2-102-74" if y3_cluster=="07-2-102-047" & id_04==74
replace y3_cluster="07-3-202-006" if y3_cluster=="07-3-202-031" & id_04==6
replace y3_cluster="07-3-202-030" if y3_cluster=="07-3-202-031" & id_04==30
isid y3_cluster
tempfile y3comm_admlink
save `y3comm_admlink', replace

use `sample', clear
list `admin1' `admin2' `admin3' `ea' if y3_cluster=="07-2-102-047"
list `admin1' `admin2' `admin3' `ea' if y3_cluster=="07-3-202-031" /*this one is weird it has district 2 and 3 hhs from many different wards*/
merge m:1 `admin0' `admin1' `admin2' `admin3' `ea' using `sample_ea', nogen keepusing(ea_c)
merge m:1 `admin0' `admin1' `admin2' `admin3' using `sample_admin3', nogen keepusing(admin3_c)
merge m:1 y3_cluster using `y3comm_admlink', nogen keep(1 3) //931 unmatched hhs out of 500 - possibly splitoffs where no community data collected?
egen id_03_c = group(id_03 id_02 id_01)
tempfile commlink_hh
save `commlink_hh', replace

collapse (median) admin3_c `admin3' `admin2' `admin1' `admin0' (min) min_adm3=admin3_c (max) max_adm3=admin3_c (sd) sd_adm3=admin3_c, by(id_03_c id_03 id_02 id_01) 

levelsof id_03_c if min_adm3 != max_adm3, loc(checklist)
display "`checklist'"

use `commlink_hh', clear
foreach id_c in `checklist' {
    display "`id_c'"
	tabulate admin3_c if id_03_c==`id_c', sort matrow(mat)
	replace admin3_c=mat[1,1] if id_03_c==`id_c'
	foreach l in 3 2 1 0 {
	    qui: su `admin`l'' if admin3_c==mat[1,1], detail
	    replace `admin`l'' = r(p50) if id_03_c==`id_c'
		}
	}
	
collapse (median) admin3_c `admin3' `admin2' `admin1' `admin0', by(id_03_c id_03 id_02 id_01) 

tempfile comm_to_adm3_link
save `comm_to_adm3_link', replace
save "${TEMP}/Community_merge_r3.dta", replace


** Household sample file

use `sample', clear
merge m:1 `admin0' `admin1' `admin2' `admin3' `ea' using `sample_ea', nogen keep(1 3) keepusing(ea_c)
merge m:1 `admin0' `admin1' `admin2' `admin3' using `sample_admin3', nogen keep(1 3) keepusing(admin3_c)
merge m:1 `admin0' `admin1' `admin2' using `sample_admin2', nogen keep(1 3) keepusing(admin2_c)
merge m:1 `admin0' `admin1' using `sample_admin1', nogen keep(1 3) keepusing(admin1_c)
merge m:1 `admin0' using `sample_admin0', nogen keep(1 3) keepusing(admin0_c)

save "${TEMP}/Sample_links_R3.dta", replace 


****************************
**** Expenditures FOOD  ****
****************************

** First generate month_year from the hh interview date

use  "${IN}\HH_SEC_A.dta", clear

rename hh_a18_3 year_int
rename hh_a18_2 month_int

keep `hh' year_int month_int

sort year_int month_int

tempfile intdate
save `intdate', replace


use "${IN}\HH_SEC_J1.dta", clear
label list hh_j01
tab hh_j01,m
recode hh_j01 2=0 

label list hh_j02_1
/*1 KILOGRAMS 2 GRAMS 3 LITRE 4 MILLILITRE 5 PIECES*/

rename 	(hh_j01 	hh_j02_1 	hh_j02_2  	hh_j03_1 		hh_j03_2  		hh_j04 	hh_j05_1  	hh_j05_2 	hh_j06_1  			hh_j06_2) ///
		(eat 		unit_total 	q_total 	unit_purchase 	q_purchase 		spend 	unit_prod 	q_prod 				unit_gift 			q_gift)
 
duplicates report `hh' itemcode 
foreach i in total purchase prod gift {
   recode unit_`i' 0=.
   }
   
*data consistency check
gen ck0=(q_total==0 | q_total==.) if eat==1 //2 obs found (case 1: purchased and showed expenditure but not q_purchase or q_total -- hh is 0846-001 and item is 809.) (case 2: no info prob didn't consume -- hh is 1443-001 item is 601)

replace eat=0 if `hh'=="1443-001" & itemcode==601

tab ck0
g unitv = spend/q_purchase 
su unitv if unit_purchase==2 & itemcode==809, detail 
loc pr = r(p50)
replace q_purchase= spend/`pr' if `hh'=="0846-001" & item==809 // this is the obs that violates ck0. 
replace unit_purchase = unit_total if `hh'=="0846-001" & item==809
replace q_total=q_purchase if `hh'=="0846-001" & item==809
drop unitv

replace eat=0 if ck0==1 & q_total==0 //no obs
foreach i in total purchase prod gift {  
   replace unit_`i'=. if ck0==1 & q_total==0  //no changes made
   }

replace q_total= q_gift if (q_gift!=.) & (q_gift > 0) & q_total == .  //no obs 
gen ck_=(q_total!=.) if eat==0
tab ck_  //no obs 

foreach i in purchase prod gift { 
	gen ck_0`i'=(unit_`i'==. & q_`i' !=.) //unit is missing but q is not missing
	replace ck_0`i'=0 if unit_`i'==. & q_`i' ==0 //unit is missing but q is not 0
	gen ck_1`i'=(unit_`i'!=. & q_`i'==.) //q is missing but unit is not missing
	  
	tab ck_0`i',m // there is one ck_0gift error
	tab ck_1`i',m 
	}

*fix the errors
replace unit_purchase=unit_total if ck_0purchase==1 //0 change
replace unit_prod=unit_total if ck_0prod==1 //0 change
replace unit_gift=unit_total if ck_0gift==1 //1 change
replace unit_prod=. if ck_1prod==1 //0 change

gen ck_2=0
replace ck_2=1 if spend==0 & q_purchase>0 
tab ck_2 
replace unit_gift=unit_total if ck_2==1  //2 obs
replace q_gift=q_total if ck_2==1 & q_prod==0 //0 obs
	replace q_gift=q_total - q_prod if ck_2==1 & q_prod>0 //1 obs
replace q_purchase=. if ck_2==1 //2 obs
replace unit_purchase=. if ck_2==1 //2 obs
replace spend=. if ck_2==1 //2 obs

*replace missing quantity with 0
foreach i in total purchase prod gift {
	recode q_`i' (.=0)
	}

drop ck0 ck_*

*convert units (grams->kilograms/ millilitre->litre)
tab unit_total

foreach i in total purchase prod gift { 
	replace q_`i'=q_`i'/1000 if unit_`i'==2 | unit_`i'==4 
	replace unit_`i'=1 if unit_`i'==2
	replace unit_`i'=3 if unit_`i'==4
	}
	
tempfile sec_j1_mid
save `sec_j1_mid', replace


*check if units for all sources are equal within an item 
rename (unit_total q_total unit_purchase q_purchase unit_prod q_prod unit_gift q_gift) ///
(unit_1 q_1 unit_2 q_2 unit_3 q_3 unit_4 q_4)
reshape long unit_ q_,i(`hh' itemcode) j(source)
drop if q_==0
by `hh' itemcode, sort: egen sd_= sd(unit_)
g diff= sd_ != . & sd_ > 0 /*these have only one type of cons per hh-item*/
drop sd_
tab diff //5 cases of differing units within a hh-item. These don't seem to be errors.)


** Purchase unit values (expenditures / quantity)
use `sec_j1_mid', clear

gen price_purchase=spend/q_purchase
g ch_=1 if price_purchase==. & q_purchase!=. & q_purchase>0 /*only 2 - q_purchase is same as q_total but spend is just missing. Treat these as gifts instead */
replace q_gift=q_purchase if ch_==1 
replace unit_gift=unit_purchase if ch_==1 
replace q_purchase=. if ch_==1
replace unit_purchase=. if ch==1

sort `hh'
merge m:1 `hh' using `sample'
drop _merge 

tempfile sec_j1_mid
save `sec_j1_mid', replace

keep if q_total>0 & q_total != . 
tempfile food_consumed_consumption
save `food_consumed_consumption', replace
save ${TEMP}/food_consumed_consumption_r3.dta, replace

** Generate median prices per kg from price_purchase (only for consumers)

keep if q_purchase > 0 & q_purchase != . 
tempfile food_purchased_consumption
save `food_purchased_consumption', replace

*by urb-region-district-ward-ea all units
preserve
collapse (median) p_purchase_adm4_=price_purchase, by(itemcode unit_purchase urban `admin0' `admin1' `admin2' `admin3' ea)
keep if p_purchase_adm4_ != . //drop missing p drawn from unit_purchase=. 
sort itemcode unit_purchase urban `admin1' `admin2' `admin3' ea
reshape wide p_purchase_adm4_, i(itemcode urban `admin0' `admin1' `admin2' `admin3' ea) j(unit_purchase)
tempfile p_purchase_adm4 
save `p_purchase_adm4', replace
restore

*generate median unit price by urb-region-district-ward
preserve
collapse (median) p_purchase_adm3_=price_purchase, by(itemcode unit_purchase urban `admin0' `admin1' `admin2' `admin3')
keep if p_purchase_adm3_ != .
sort itemcode unit_purchase urban `admin0' `admin1' `admin2' `admin3'
reshape wide p_purchase_adm3_, i(itemcode urban `admin0' `admin1' `admin2' `admin3') j(unit_purchase)
tempfile p_purchase_adm3
save `p_purchase_adm3', replace
restore

*generate median unit price by urb-region-district
preserve
collapse (median) p_purchase_adm2_=price_purchase, by(itemcode unit_purchase urban `admin0' `admin1' `admin2')
keep if p_purchase_adm2_ != .
sort itemcode unit_purchase urban `admin0' `admin1' `admin2'
reshape wide p_purchase_adm2_, i(itemcode urban `admin0' `admin1' `admin2') j(unit_purchase)
tempfile p_purchase_adm2
save `p_purchase_adm2', replace
restore

*generate median unit price by urb-region
preserve
collapse (median) p_purchase_adm1_=price_purchase, by(itemcode unit_purchase urban `admin0' `admin1')
keep if p_purchase_adm1_ != .
duplicates drop
sort itemcode unit_purchase urban `admin0' `admin1' 
reshape wide p_purchase_adm1_, i(itemcode urban `admin0' `admin1') j(unit_purchase)
tempfile p_purchase_adm1
save `p_purchase_adm1', replace
restore

*generate median unit price by macroregion
preserve
collapse (median) p_purchase_adm0_=price_purchase, by(itemcode unit_purchase urban `admin0')
keep if p_purchase_adm0_ != .
duplicates drop
sort itemcode unit_purchase urban `admin0' 
reshape wide p_purchase_adm0_, i(itemcode urban `admin0') j(unit_purchase)
tempfile p_purchase_adm0
save `p_purchase_adm0', replace
restore

*generate median unit price by urban
preserve
collapse (median) p_purchase_urbrur_=price_purchase, by(itemcode unit_purchase urban)
keep if p_purchase_urbrur_ != .
duplicates drop
sort itemcode unit_purchase urban
reshape wide p_purchase_urbrur_, i(itemcode urban) j(unit_purchase)
tempfile p_purchase_urbrur
save `p_purchase_urbrur', replace
restore

*generate median unit price by unit
preserve
collapse (median) p_purchase_unit_=price_purchase, by(itemcode unit_purchase)
keep if p_purchase_unit_ != .
duplicates drop
sort itemcode unit_purchase
reshape wide p_purchase_unit_, i(itemcode) j(unit_purchase)
tempfile p_purchase_unit
save `p_purchase_unit', replace
restore


** Merge prices for different levels from price_purchase with consumption info **

*Notes: In this section only keep housheold who consumed that item
*       Information about non-consuming households is not relevant for unit conversion. 

use `food_consumed_consumption', clear
tab q_total if unit_total == . //0 obs 
rename unit_total unit

keep `hh' itemcode unit ea `admin3' `admin2' `admin1' `admin0' urban  
tempfile price_sample
save `price_sample', replace

merge m:1 itemcode ea `admin3' `admin2' `admin1' `admin0' urban using `p_purchase_adm4'
drop _merge
merge m:1 itemcode `admin3' `admin2' `admin1' `admin0' urban using `p_purchase_adm3' 
drop _merge
merge m:1 itemcode `admin2' `admin1' `admin0' urban using `p_purchase_adm2' 
drop _merge
merge m:1 itemcode `admin1' `admin0' urban using `p_purchase_adm1'
drop _merge
merge m:1 itemcode `admin0' urban using `p_purchase_adm0'
drop _merge
merge m:1 itemcode urban using `p_purchase_urbrur'
drop _merge
merge m:1 itemcode using `p_purchase_unit'
drop _merge

foreach lev in adm4 adm3 adm2 adm1 adm0 urbrur unit {
	foreach u in 1 3 5 {
		winsor2 p_purchase_`lev'_`u' if unit==`u', suffix(imp) by(itemcode) cuts(5 95)
		g dif = p_purchase_`lev'_`u'!= p_purchase_`lev'_`u'imp & p_purchase_`lev'_`u'!=. & unit==`u'
		display "level: `lev'; unit: `u'; p_purchase_`lev'_`u'"
		bysort itemcode: su p_purchase_`lev'_`u' p_purchase_`lev'_`u'imp if dif==1
		replace p_purchase_`lev'_`u' = p_purchase_`lev'_`u'imp if dif==1
		drop dif
		}
	}

drop p_purchase*imp
	
tempfile prices
save `prices', replace


** Now fill in missing prices

use `sec_j1_mid', clear
rename unit_total unit
merge m:m `hh' itemcode unit using `prices'
tab q_total if _merge==1 
tab q_total if _merge==2
drop if _merge==1  // 228,152 observation with no total consumption; these do not matter for the unit conversion. 
drop _merge

* generate median level price 
foreach u in 1 3 5 {
	gen price_`u'_med = p_purchase_adm4_`u' 
	}

foreach u in 1 3 5 {
	* keep median price only if there are 3 or more households that consumed that item with the same unit in the same ea
	bys itemcode unit ea `admin3' `admin2' `admin1' `admin0' urban: egen n_`u'_adm4=count(price_purchase!=.) if unit==`u'
	tab n_`u'_adm4 
	replace price_`u'_med =. if n_`u'_adm4<3
	
	* if ea level is missing, replace with ward level prices
	bys itemcode unit `admin3' `admin2' `admin1' `admin0' urban: egen n_`u'_adm3=count(price_purchase!=.) if unit==`u'
	tab n_`u'_adm3 
	replace price_`u'_med = p_purchase_adm3_`u' if price_`u'_med == . & n_`u'_adm3 >= 3 

	* if ea and ward level are missing, replace with district level prices
	bys itemcode unit `admin2' `admin1' `admin0' urban: egen n_`u'_adm2=count(price_purchase!=.) if unit==`u'
	tab n_`u'_adm2 
	replace price_`u'_med = p_purchase_adm2_`u' if price_`u'_med == . & n_`u'_adm2 >= 3 

	* if ea, ward and district level are missing, replace with region level prices
	bys itemcode unit `admin1' `admin0' urban: egen n_`u'_adm1=count(price_purchase!=.) if unit==`u'
	tab n_`u'_adm1 
	replace price_`u'_med = p_purchase_adm1_`u' if price_`u'_med == . & n_`u'_adm1 >= 3 

	* if ea, ward, district and region level are missing, replace with macro-region level prices
	bys itemcode unit `admin0' urban: egen n_`u'_adm0=count(price_purchase!=.) if unit==`u'
	tab n_`u'_adm0 
	replace price_`u'_med = p_purchase_adm0_`u' if price_`u'_med == . & n_`u'_adm0 >= 3 

	* if ea, ward, district, region and macro-region level are missing, replace with urban/rural level prices
	bys itemcode unit urban: egen n_`u'_urbrur=count(price_purchase!=.) if unit==`u'
	tab n_`u'_urbrur 
	replace price_`u'_med = p_purchase_urbrur_`u' if price_`u'_med == . & n_`u'_urbrur >= 3 

	* if all else missing, replace with unit prices
	bys itemcode unit: egen n_`u'_unit=count(price_purchase!=.) if unit==`u'
	tab n_`u'_unit 
	replace price_`u'_med = p_purchase_unit_`u' if price_`u'_med == . & n_`u'_unit >= 3 
	}

foreach u in 1 3 5 {
	count if price_`u'_med == . & unit==`u'
	** 1 for kg; 3 for liter; 5 for pieces;
	drop if price_`u'_med == . & unit==`u'
	
	winsor2 price_`u'_med if unit==`u', suffix(imp) by(itemcode) cuts(5 95)
	replace price_`u'_med=price_`u'_medimp
	}

drop p_purchase_adm4_1 - p_purchase_unit_5 *medimp
drop n_1_adm4 - n_5_unit

tempfile consumed_consumption
save `consumed_consumption', replace


** Now save price dataset

collapse (median) price_1_med price_3_med price_5_med, by(`admin0' `admin1' `admin2' `admin3' `ea' itemcode)

tempfile prices_purchased
save `prices_purchased', replace


** Convert to most common unit

use `consumed_consumption', clear

count if unit!=1 & unit!=3 //  739 obs with non-standard unit  
* Identify the most common unit for each item. In the final dataset we will convert to kg, for now liters and pieces are still OK
gen kg=1 if unit==1
gen lit=1 if unit==3
gen pie=1 if unit==5
collapse (sum) nkg=kg nlit=lit npie=pie, by(itemcode) // count the number of units of each item

*compare the number of units within item and find the the most common one:
gen common_unit=.
replace common_unit=1 if nkg>=nlit & nkg>=npie
replace common_unit=3 if nlit>nkg & nlit>npie
replace common_unit=5 if npie>nkg & npie>nlit

tab common_unit
count if common_unit==.

label var common_unit "most common unit per item: 1=kg, 3=liter, 5=pieces"
sort common_unit
by common_unit: list itemcode nkg nlit npie // check whether we find the most common unit correctly - looks good

drop nkg nlit npie 

tempfile commonunit_w1
save `commonunit_w1', replace

* Now convert to most common unit
use  `consumed_consumption', clear
merge m:1 itemcode using `commonunit_w1'
drop _m

* Convert unit of total consumption to the most common unit for each item  *
gen nonst = (unit!=1 & unit!=3) // 739 obs
count if nonst==1 & common_unit==1 & price_1_med==. /*17*/
count if nonst==1 & common_unit==3 & price_3_med==. /*0*/
count if nonst==1 & common_unit==5 & price_5_med==. /*0*/

* Use the median price to get an approximate value of total consumption. When consumption reported in non-standard units, we convert using the median price at the EA or higher level. We use this price to get the value of total consumption if there is a non-standard unit of consumption.
g spend_temp = q_total * price_1_med if unit==1
replace spend_temp = q_total * price_3_med if unit==3
replace spend_temp = q_total * price_5_med if unit==5

* For items whose most common unit is kg, covert all units to kg [and for liters, pieces as well]
levelsof itemcode, local(items)
foreach item in `items' {
	foreach u in 1 3 5 {
		replace q_total = spend_temp / price_`u'_med if itemcode==`item' & unit!=`u' & common_unit==`u'
		replace q_purchase = spend_temp / price_`u'_med if itemcode==`item' & unit!=`u' & common_unit==`u'
		replace unit=`u' if itemcode==`item' & common_unit==`u'
		}
	}

tempfile consumed_consumption_mid2_t
save `consumed_consumption_mid2_t', replace


** exclude outliers of quantity per capita consumed (pq_total) and of quantity purchased (pq_purchase) and of price, using interquartile 

merge m:1 `hh' using "${IN}\ConsumptionNPS3.dta", nogen keepusing(adulteq hhsize)

g price_med = price_1_med if common_unit==1
replace price_med = price_3_med if common_unit==3
replace price_med = price_5_med if common_unit==5

foreach j in total purchase { 
	g qpc_`j' = q_`j' / adulteq
	}

levelsof itemcode, local(items)
foreach item in `items' {
	tab itemcode if itemcode==`item'
	foreach j in total purchase {	
		su qpc_`j' if itemcode==`item' & qpc_`j'>0 & qpc_`j'!=., detail /*excludes zero from the lower bound*/
		replace qpc_`j' = r(p95) if itemcode==`item' & qpc_`j'>0 & qpc_`j'!=. & qpc_`j'>r(p95)
		replace qpc_`j' = r(p5) if itemcode==`item' & qpc_`j'>0 & qpc_`j'!=. & qpc_`j'<r(p5)
		}
	su price_purchase if itemcode==`item' & q_purchase>0 & q_purchase!=., detail 
	replace price_purchase = r(p95) if itemcode==`item' & q_purchase>0 & q_purchase!=. & price_purchase>r(p95)
	replace price_purchase = r(p5) if itemcode==`item' & q_purchase>0 & q_purchase!=. & price_purchase<r(p5)
	}

replace spend=q_total*price_purchase if price_purchase != .
replace spend=q_total*price_med if price_purchase==. & spend==. & q_total > 0 & q_total!=.

duplicates tag `hh' itemcode, gen(dup)
tab dup // no duplicates 

drop price_1_med price_3_med price_5_med

tempfile consumed_consumption_mid2 
save `consumed_consumption_mid2', replace 


**  Now restore obverations for non-consumers   

use `sample', clear
merge 1:m `hh' using `consumed_consumption_mid2'

tab _merge
drop if _merge!=3 //none dropped 
drop _merge 

count if unit==common_unit /*this is true for all obs*/
count

*restore obs for households who did not consume (assume missing means non-consumption)
*drop if itemcode==.
duplicates drop // 0 obs

rename spend foodcons

keep `hh' `admin0' `admin1' `admin2' `admin3' `ea' itemcode  eat q_total q_purchase foodcons unit foodcons price_purchase price_med
destring unit, replace 

*We reshape the data to wide so that we can imput q_total=0, foodcons=0 for each item

drop if itemcode==. | unit==.

levelsof itemcode, local(items)

reshape wide eat q_total q_purchase unit foodcons price_purchase price_med , i(`hh') j(itemcode)

foreach i in `items' {
	replace eat`i'=0              if eat`i'==.
	replace q_total`i'=0          if q_total`i'==.
	replace foodcons`i'=0         if foodcons`i'==.
	replace q_purchase`i'=0       if q_purchase`i'==.
	}
	
*Reshape data back to long
reshape long eat q_total q_purchase unit foodcons price_purchase price_med, i(`hh') j(itemcode)
drop if q_total==.

codebook itemcode // 42 items 
tab itemcode

g foodgroup = .
label define foodgroups 1 staples 2 pulsenutfruitveg 3 animal 4 other
label values foodgroup foodgroups
replace foodgroup = 1 if inlist(itemcode,101,102,103,104,105,106,107,108,109,110,111,112,201,202,203,204,205,206,207)
replace foodgroup = 2 if inlist(itemcode,401,501,503,504,502,601,602,603,701,702,703)
replace foodgroup = 3 if inlist(itemcode,801,802,803,804,805,806,807,808,809,810,901,902,903)
replace foodgroup = 4 if inlist(itemcode,301,302,303,704,1001,1002,1003,1004,1101,1102,1103,1104,1105,1106,1107,1108)

tab foodgroup 

tempfile foodcons_item_wk
save `foodcons_item_wk', replace


** merge with prices

merge m:1 `admin0' `admin1' `admin2' `admin3' `ea' itemcode using `prices_purchased', keepusing(price_1_med price_3_med price_5_med)
tab _m
drop _m
drop if `hh'==""

foreach u in 1 3 5 {
	replace price_med = price_`u'_med if price_med==. & price_`u'_med!=. & unit==`u'
	}

replace foodcons = price_med * q_total if q_total > 0 & price_med==.
drop if inlist(itemcode,503,504,805,806,810,903,1103,1105,1108) /*These items are too rarely consumed*/


** Generate price unit values now

duplicates report `hh' itemcode // no duplicates  

keep `hh' itemcode foodgroup eat unit q_total q_purchase foodcons price_purchase price_med 

tempfile foodcons_item_wk
save `foodcons_item_wk', replace

save "${TEMP}/Foodcons_item_wk_W3.dta", replace

collapse (sum) foodcons, by(foodgroup `hh')

reshape wide foodcons, i(`hh') j(foodgroup)

su foodcons*, detail

foreach var in foodcons1 foodcons2 foodcons3 foodcons4 {
	replace `var' = `var'*52 /*annualized*/
	winsor2 `var', suffix(imp) cuts(0 99) 
	replace `var' = `var'imp 
	}

egen foodcons = rowtotal(foodcons1 foodcons2 foodcons3 foodcons4)

tempfile foodcons_hh
save `foodcons_hh', replace



*****************************
**** Expenditures GOODS  ****
*****************************

** Goods consumption -- includes food (section J), non-food one-week recall items (section K), non-food annual items (section L)

* Non-food goods consumption - from sections K and L (some are goods some are services)
* Section K is non-food one week 
	* --> Goods include cigarettes, matches, kerosene, gas, petrol, soap, toiletries, personal products, cleaning products, bulbs, 
* Section L is non-food 12 months 
	* --> Goods include carpets, linens, mats, mosquito nets, mattresses, sports equipment, film

import excel using "${LOCDRIVE}\Parameters\GoodServiceCodes.xlsx", clear firstrow sheet("Tanzania_R3")

tempfile goodservicecodes
save `goodservicecodes', replace

use "${IN}\HH_SEC_K.dta", clear
rename hh_k01 purchase
rename hh_k02 exp

tab purchase 
replace purchase = 1 if purchase==2 & exp >0 & exp != . //no cases
replace purchase = 1 if purchase==. & exp >0 & exp != .  //no cases

drop if purchase==2 

keep `hh' itemcode purchase exp
sort `hh'

tempfile 	 sectionL // this is not section L in this round but it is in the other two...
save 		`sectionL', replace

save "${TEMP}\SectionL_raw_R3", replace


use "${IN}\HH_SEC_L.dta", clear
rename hh_l01 purchase
rename hh_l02 exp /*value not expenditures on purchased*/
replace exp = hh_l03 if (itemcode == 318 | itemcode == 319) & (exp==0 | exp==.) & (hh_l03 != . | hh_l03 != 0) /*Bamboo and grass - exp reports value consumed and smq3 reports value purchased. Because both refer to the full 12 months, we use value consumed primarily.*/ 

drop if (purchase == 2) & (exp == . | exp == 0)

keep `hh' itemcode purchase exp
sort `hh'

append using `sectionL'

sort itemcode
merge m:1 itemcode using `goodservicecodes', nogen keep(1 3) keepusing(recall good service other good_* service_*)

winsor2 exp, suffix(imp) cuts(0 99) by(itemcode) /*winsorize top 1 percent*/

foreach type in service good other {
	g exp_`type' = expimp * `type' * (12/recall) /*recall is the number of months per recall period*/
	}

loc goodservicecollapselist ""
foreach type in `gooddisagglist' `servicedisagglist' {
	g exp_`type' = expimp * `type' * (12/recall) /*recall is the number of months per recall period*/
	loc goodservicecollapselist `goodservicecollapselist' exp_`type'
	}


**Export item level data for descriptive analysis later
sort `hh' itemcode
save "${TEMP}\Tanzania_GoodService_Items_R3.dta", replace

collapse (sum) exp_good exp_service exp_other `goodservicecollapselist', by(`hh')

sort `hh'
merge 1:1 `hh' using `sample', nogen keep(1 3)

winsor2 exp_good exp_service exp_other `goodservicecollapselist', suffix(imp) cuts(0 99) by(`admin1')  /*winsorize top 0.5 percent by region*/

foreach g in good service other `gooddisagglist' `servicedisagglist' {
	rename exp_`g'imp exp_`g'_secLM
	drop exp_`g'
	}

sort `hh'
tempfile goodsandservices
save `goodsandservices', replace


********************************
**** Expenditures SERVICES  ****
********************************

** Services consumption
* Section F is food outside hh - 7 day value of expenditures by member on prepared food and bar and local brews --> YES
* Section I is housing --> rent? housing materials? use of water vender? --> YES
* Section C is education
* Section D is health, Children elsewhere section doesn't exist, Section O is assistance and groups, Section P is credit, [Section Q is crime/justice?], Section R is shocks, Section S is deaths

* Food outside of house consumption (Section F)

use "${IN}\HH_SEC_F.dta", clear
keep `hh' `indiv' hh_f*

sort `hh'
merge m:1 `hh' using `sample', nogen keep(1 3) keepusing(`admin1' `admin0')

drop if hh_f01==2 | hh_f01==. /*no data are lost here*/

*3: 3 exp obs w 2 not coded to 1; 5: none; 7: none; 9: none; 11: one as no; 13: one as no; 15: one as no
foreach k in 3 5 7 9 11 13 15 {
	loc j = `k' - 1
	if `k'<10 {
	    loc kind 0`k'
		loc jind 0`j'
		}
		else if `k'>10 {
		    loc kind `k'
			loc jind `j'
			}
	bysort hh_f`jind': su hh_f`kind', detail
	replace hh_f`kind' = 0 if (hh_f`kind' == 99 | hh_f`kind' == 9) & hh_f`jind' != 1  /*no changes made*/
	}

egen exp_service_food = rowtotal(hh_f03 hh_f05 hh_f07 hh_f09 hh_f11 hh_f13 hh_f15), missing /*3 missing vals are zeros across the board*/

winsor2 exp_service_food, suffix(imp) cuts(0 99) by(`admin1' `admin0')
replace exp_service_food = exp_service_foodimp * 52 /*recall period is one week*/

collapse (sum) exp_service_food, by(`hh' `admin1' `admin0')

winsor2 exp_service_food, suffix(imp) cuts(0 99) by(`admin1'  `admin0')
replace exp_service_food = exp_service_foodimp

sort `hh' 
tempfile 	 services_food
save 		`services_food', replace


* Rent and housing (Section I)

use "${IN}\HH_SEC_I.dta", clear

keep `hh' hh_i*
sort `hh'
merge 1:1 `hh' using `sample', nogen keep(1 3) keepusing(`admin1' `admin0')

rename hh_i03 exp_rent
recode exp_rent (.=0)
replace exp_rent = exp_rent * 12 /*annualized*/

winsor2 exp_rent, suffix(imp) cuts(0 99) by(`admin1')
replace exp_rent = exp_rentimp

rename exp_rent exp_service_rent

keep `hh' exp_service_rent
sort `hh'

tempfile	 services_rent
save		`services_rent', replace


* Education (Section C)

use "${IN}\HH_SEC_C.dta", clear

keep `hh' `indiv' hh_c28_* /*these are broken out but the tot variable is the sum of the components in all but 59 cases (6070 they are the same). Could be a difference between cash and in kind (total includes in kind)*/
sort `hh'
merge m:1 `hh' using `sample', nogen keep(1 3) keepusing(`admin1')

egen exp_good_educ = rowtotal(hh_c28_2 hh_c28_3), missing /*books and uniform*/
egen exp_service_educ = rowtotal(hh_c28_1 hh_c28_4 hh_c28_5 hh_c28_6 hh_c28_7), missing /*fees, transportation, tuition, other, food */

winsor2 exp_service_educ exp_good_educ, suffix(imp) cuts(0 99) by(`admin1')
replace exp_service_educ = exp_service_educimp
replace exp_good_educ = exp_good_educimp

recode exp_service_educ exp_good_educ (.=0)
collapse (sum) exp_service_educ exp_good_educ, by(`hh')

keep `hh' exp_service_educ exp_good_educ
sort `hh'

tempfile exp_educ
save `exp_educ', replace


* Health (Section D)
use "${IN}\HH_SEC_D.dta", clear

keep `hh' `indiv' hh_d* /*paper questionnaire questions are different than computer*/
sort `hh'
merge m:1 `hh' using `sample', nogen keep(1 3) keepusing(`admin1')

egen service_health_4w = rowtotal(hh_d07 hh_d09), missing /*monthly*/
egen service_health_hos = rowtotal(hh_d13 hh_d15), missing /*annual hospital and healer expenses*/
g exp_good_health = hh_d08 /*monthly*/

winsor2 service_health_4w service_health_hos exp_good_health, suffix(imp) cuts(0 99) by(`admin1')
foreach var in service_health_4w service_health_hos exp_good_health {
	replace `var' = `var'imp
	recode `var' (.=0)
	}

replace exp_good_health = exp_good_health * (52/4)
replace service_health_4w = service_health_4w * (52/4)

egen exp_service_health = rowtotal(service_health_4w service_health_hos)

collapse (sum) exp_service_health exp_good_health, by(`hh')

keep `hh' exp_service_health exp_good_health
sort `hh'

tempfile exp_health
save `exp_health', replace


*********************
**** Prices FOOD ****
*********************

** Use mean in base period and district as base #fixfixfix should I use the sample-wide mean instead?
*https://www.stata.com/statalist/archive/2003-01/msg00076.html
/*items dropped from HDS: paddy rice 101  green cob maize 103 cashew almonds other nuts 503 canned dried and wild vegetables 603 package fish 810 prepared tea coffee 1105 */

** Now generate the Fisher price indexes

use `foodcons_item_wk', clear
g unitv = foodcons/q_total
merge m:1 `hh' using `sample', nogen keepusing(urban `admin0' `admin1' `admin2' `admin3' `weight')

tempfile foodcons_item_sample
save `foodcons_item_sample', replace

use `foodcons_item_sample', clear
merge m:1 itemcode using "${TEMP}\Base_qty_price_adm0.dta", nogen keepusing(x0_adm0 p0_adm0)

g laspeyeres_num = price_med*x0_adm0
g laspeyeres_denom = p0_adm0*x0_adm0
g paasche_num = price_med*q_total
g paasche_denom = p0_adm0*q_total 

collapse (sum) laspeyeres_num laspeyeres_denom paasche_num paasche_denom eat, by(`hh')

g fisher = sqrt((laspeyeres_num/laspeyeres_denom)*(paasche_num/paasche_denom))

winsor2 fisher, suffix(imp) cuts(1 99)
replace fisher=fisherimp

rename fisher pr_foodtot

g lpr_food = log(pr_foodtot)


** Save dataset

keep `hh' lpr_food pr_foodtot 
sort `hh'  

tempfile 	 prices_foodtot
save 		`prices_foodtot', replace

use `sample', clear
merge 1:1 `hh' using `prices_foodtot',  nogen keep(1 3)

tempfile 	 prices_foodtot
save 		`prices_foodtot', replace


********************************
**** Total Expenditures ****
********************************

use "${IN}\ConsumptionNPS3.dta", clear
keep `hh' expm foodIN foodOUT utilities hhexpenses health transport communic recreat educa other hhsize  adulteq

rename hhsize hhsize_adeq

*egen exp_sagg_other = rowtotal(other)

g pcexp = expm / hhsize /*annual nominal expenditures*/
g pcexp_adeq = expm / hhsize_adeq
rename expm hhexp

winsor2 hhexp pcexp pcexp_adeq, suffix(imp) cuts(0 99)
replace pcexp=pcexpimp
replace pcexp_adeq=pcexp_adeqimp
replace hhexp=hhexpimp 

keep `hh' hhexp pcexp pcexp_adeq hhsize hhsize_adeq /*exp_sagg**/
sort `hh'

tempfile	 expenditures
save		`expenditures', replace
*


***********************
**** Covariates ****
***********************/

** Education variables

use "${DATADRIVE}\OtherRaw\TimeuseAll_indiv_${YEAR}.dta" 

g educ_yrs_head = educ_yrs if relationtohead==1
g female_head = female if relationtohead==1
g age_head = age if relationtohead==1
g educ_yrs_spouse = educ_yrs if relationtohead==2 
g age_spouse = age if relationtohead==2

sort `hh' relationtohead
collapse (firstnm) female_head age_head educ_yrs_head (max) educ_yrs_max=educ_yrs educ_yrs_spouse age_spouse, by(`hh')
count if female_head==. 
count if age_head==.
count if educ_yrs_head==.
replace educ_yrs_head = educ_yrs_spouse if educ_yrs_head==.
replace age_head = age_spouse if age_head==.

sort `hh'
keep `hh' educ_yrs_head educ_yrs_max

tempfile educ
save `educ', replace


** Dependency Variables

use "${DATADRIVE}\OtherRaw\TimeuseAll_indiv_${YEAR}.dta", clear
keep `hh' `indiv' age tobedropped

sort `hh' `indiv'
merge 1:1 `hh' `indiv' using "${IN}\HH_SEC_B.dta", keepusing(hh_b04)
replace age = hh_b04 if age==.
preserve

g pop_dep = ( age < 15 | age > 65 ) & age != . & tobedropped==0
g pop_indep = ( age >= 15 & age <= 65 ) & age != . & tobedropped==0

collapse (sum) pop_dep pop_indep, by(`hh')
count if pop_dep==.
count if pop_dep==0 & pop_indep==0
count if pop_indep==.
count if pop_indep==0

g hh_depshare = (pop_dep + 0.1) / (pop_dep + pop_indep + 0.1) /*the 0.1 is to prevent dividing by zero if age missing from all*/
count if hh_depshare==.
	
keep `hh' hh_depshare
tempfile depshare
save `depshare', replace


** Participation Variables 

use "${DATADRIVE}\OtherRaw\TimeuseAll_hh_${YEAR}.dta", clear 

g pcmh_farm = p_ownfarm==1 & h_ownfarm > 35 /*"Min hours participation" participation - at least 35 hours per year*/
g pcmh_ent = p_ownent == 1 & h_ownent > 35
g pcmh_market = p_wagelbr == 1 & h_wagelbr > 35


keep `hh' pcmh_* 
tempfile pcmh
save `pcmh', replace


** Asset index

use "${IN}\HH_SEC_M.dta", clear 
/* Asset list same as R1
         401 Radio and Radio Cassette - yes
         402 Telephone(landline) - yes
         403 Telephone(mobile) - yes
         404 Refrigerator or freezer - yes
         405 Sewing Machine - yes
         406 Television - yes
         407 Video / DVD - yes
         408 Chairs - yes
         409 Sofas - yes
         410 Tables - yes
         411 Watches - yes
         412 Beds - yes
         413 Cupboards, chest-of-drawers, boxes, wardrobes, bookcases - yes
         414 Lanterns - yes
         415 Computer - yes
         416 Cooking pots, Cups, other kitchen utensils - yes
         417 Mosquito net - yes
         418 Iron (Charcoal or electric) - yes
         419 Electric/gas stove - yes
         420 Other stove - yes
         421 Water-heater - yes
         422 Record/cassette player, tape recorder - yes
         423 Complete music system - yes
         424 Books (not school books) - yes
         425 Motor Vehicles - yes
         426 Motor cycle - yes
         427 Bicycle - yes
         428 Carts
         429 Animal Cart
         430 Boat/canoe
         431 Wheel barrow
         432 Livestock
         433 Poultry
         434 Outboard engine
         435 Donkeys
         436 Fields/Land
         437 House(s) - yes
         438 Air-conditioned - yes
         439 Dish antenna/decoder - yes
         440 Hoes
         441 Spraying machine
         442 Water pumping set
         443 Reapers
         444 Tractor
         445 Trailer for tractors etc.
         446 Plough etc.
         447 Harrow
         448 Milking machine
         449 Harvesting and threshing machine
         450 Hand milling machine
         451 Coffee pulping machine
         452 Fertilizer distributor
*/

loc assetlist 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 437 438 439 

g keep=0
foreach asset in `assetlist' {
	replace keep = 1 if itemcode==`asset'
	}

keep if keep==1
drop keep occ

rename hh_m01 count_ 
rename hh_m02 age_
rename hh_m03 purchpr_
rename hh_m04 valuenow_
reshape wide count_ age_ purchpr_ valuenow_, i(`hh') j(itemcode)

foreach asset in `assetlist' {
	replace count_`asset' = 0 if count_`asset'==.
	}

order y3_hhid count_* age_* purchpr_* valuenow_*
pca count_401 - count_439

predict wealthscore

keep `hh' wealthscore
la var wealthscore "Consumptive asset index"

tempfile wealthscore
save `wealthscore', replace

merge 1:1 `hh' using `wealthscore', nogen keep(1 3)
sort `hh'

 
** Build the covariates dataset

use `sample', clear
sort `hh'
merge 1:1 `hh' using `educ', nogen keep(1 3) keepusing(educ_yrs_head educ_yrs_max)
sort `hh'
merge 1:1 `hh' using `depshare', nogen keep(1 3) keepusing(hh_depshare)
sort `hh'
merge 1:1 `hh' using `pcmh', nogen keep(1 3) keepusing(pcmh_farm pcmh_ent pcmh_market)
sort `hh'
merge 1:1 `hh' using `wealthscore', nogen keep(1 3) keepusing(wealthscore)

tempfile covariates
save `covariates', replace


*********************************
**** Dataset Assembley ****
********************************/

* Put tegether full dataset

use `sample', clear
merge 1:1 `hh' using `foodcons_hh', nogen keep(1 3)
sort `hh'
merge 1:1 `hh' using `goodsandservices', nogen keep(1 3)
sort `hh'
merge 1:1 `hh' using `services_food', nogen keep(1 3)
sort `hh'
merge 1:1 `hh' using `services_rent', nogen keep(1 3)
sort `hh'
merge 1:1 `hh' using `exp_educ', nogen keep(1 3)
sort `hh'
merge 1:1 `hh' using `exp_health', nogen keep(1 3)
sort `hh'
merge 1:1 `hh' using `expenditures', nogen keep(1 3)
sort `hh'
merge 1:1 `hh' using `covariates', nogen keep(1 3)
sort `hh'
merge 1:1 `hh' using `intdate', nogen keep(1 3)
sort `hh'
save "${TEMP}\Tanzania_HH_R3.dta", replace
*

**************
log close

